/* File: create_matched_cohorts.do
 * Author: Josh Feng, Luca Maini
 * Purpose: creates the matched cohorts for the stacked analysis
 *
 * Date Created: 01/14/2023
 *
 */

* Covariate lists for the five CEM specifications (matchList1 ... matchList5).
* Each list is passed verbatim to -cem- inside the main loop. The "(#0)" suffix
* is cem syntax; per the cem help it requests no coarsening for that variable
* (i.e. exact matching) -- confirm against the installed cem version.
local exactATC  "atc_code (#0)"
local exactSize "firm_size_cat (#0)"

local matchList1 "age"
local matchList2 "age `exactATC'"
local matchList3 "age `exactATC' `exactSize'"
local matchList4 "age `exactATC' log_net_sales_1"
local matchList5 "age `exactATC' log_net_sales_1 `exactSize'"

* Automatic cutpoint rule handed to cem's auto() option. Available rules:
*   "sturges" (Sturges' rule, the default),
*   "fd"      (Freedman-Diaconis' rule),
*   "scott"   (Scott's rule),
*   "ss"      (Shimazaki-Shinomoto's rule).
* See the cem references for a description of each rule.
local method "sturges"

* Lists of relevant variables: indicators that flag a drug as treated, so that
* it can never enter a control group
local excludeList "acqd stlth_same_atc3_acqr"

* Build a Company-year file with two firm-size measures:
*   log_firm_size  - log of the firm's total net sales in the last quarter
*                    observed in each year
*   firm_size_cat  - 1/2/3 category based on the firm's maximum sales over all
*                    years (cutoffs at 250 and 2500)
* The result is stored in the tempfile `firm_size' for the merge further down.
use "${outdir}/firm_total_sales.dta", clear

* minus 1 to undo the shift in the dataset (see create_acquisition_variables.do)
replace quarter = quarter - 1
gen year = yofd(dofq(quarter))

* keep only the last quarter for each year
bysort Company year (quarter) : keep if _n == _N
drop quarter

rename tot_firm_net_sales firm_size
bysort Company : egen max_sales = max(firm_size)

* Size category from the firm's maximum sales.
* Stata treats missing as larger than any number, so an unguarded
* "max_sales >= 2500" would place firms with no recorded sales in the largest
* category. Those firms are left with a missing category instead, the same
* value that companies absent from this file receive after the merge.
gen firm_size_cat = .
replace firm_size_cat = 1 if max_sales < 250
replace firm_size_cat = 2 if max_sales >= 250 & max_sales < 2500
replace firm_size_cat = 3 if max_sales >= 2500 & !missing(max_sales)

* log() returns missing for non-positive or missing sales
gen log_firm_size = log(firm_size)

drop max_sales firm_size

tempfile firm_size
save `firm_size', replace

* Load the main regression dataset and attach the firm-size measures built
* above (they are not included in the main file)
use "${maindir}\combined_regression_dataset_with_valeant_augmented.dta", clear

* mktCompany may list several companies separated by ";". Split it into
* Company1, Company2, ... and trim every piece (the original string is trimmed
* too, because it is temporarily named Company and matches Company*).
rename mktCompany Company
split Company, parse(;)
foreach part of varlist Company* {
	replace `part' = trim(`part')
}
rename Company mktCompany

* Go long: one row per individual company within Product-mktCompany-year
reshape long Company, i(Product mktCompany year) j(companyTracker)
drop if Company == ""
drop companyTracker

* Bring in firm size by Company-year. keep(master match) retains rows whose
* Company-year has no firm-size record; those rows get missing size values.
merge m:1 Company year using `firm_size', keep(master match) nogen
drop Company

* When several companies market the product, assign the size of the largest
* one: replace each size measure by its maximum within mktCompany-year
foreach sizevar in log_firm_size firm_size_cat {
	bysort mktCompany year : egen temp = max(`sizevar')
	drop `sizevar'
	rename temp `sizevar'
}

* The long rows are now identical within Product-mktCompany-year; collapse
* them back to one
duplicates drop

* Tag drugs that CANNOT be part of ANY control group (these are treated drugs).
* A product is tagged in every year if any of the excludeList indicators is
* ever equal to 1 for that product.
gen exclude = 0
foreach flag of local excludeList {
	qui bysort Product : egen temp = max(`flag')
	qui replace exclude = 1 if temp == 1
	qui drop temp
}

* Valeant drugs are also tagged to be excluded, but only in the product-years
* actually marketed by Valeant. A more conservative alternative would tag the
* product in all years:
// qui bysort Product : egen temp = max(mktCompany == "VALEANT PHARMACEUTICALS")
replace exclude = 1 if mktCompany == "VALEANT PHARMACEUTICALS"

* keep only variables relevant for the match
keep Product year age net_sales log_firm_size firm_size_cat exclude

* Matching covariate: log net sales in the previous calendar year.
* The lag is taken only when the preceding row of the product is exactly
* year - 1. A plain [_n-1] lags by row, so a product with a gap in its panel
* would receive sales from an older year. The first row of each product and
* any row following a gap are left missing.
bysort Product (year) : gen log_net_sales_1 = log(net_sales[_n-1]) if year[_n-1] == year - 1
drop net_sales

* save for loop
save "${outdir}\data_for_cohort_match.dta", replace

* Main loop: for every product and every year in which it is observed, treat
* that product as the single "treated" unit, build the pool of eligible
* controls, and run each of the five CEM specifications. Every successful
* match is written to
*   ${outdir}\matched_cohorts\temp<name><year>_<spec>.dta
levelsof Product, local(prodList)

* Scratch file holding the prepared control pool of the current product-year.
* The preparation does not depend on the match specification, so it is done
* once per product-year instead of once per specification.
tempfile matchPool

foreach prod of local prodList {
	disp "`prod'"
	
	* file-name-safe version of the product name
	* NOTE(review): strtoname() replaces characters that are not valid in a
	* Stata name and limits the length of the result, so two distinct products
	* could map to the same file name -- confirm this cannot happen in the data
	local name = strtoname("`prod'")
	
	* open the data to get list of years for given product
	qui use "${outdir}\data_for_cohort_match.dta", clear
	qui levelsof year if Product == "`prod'", local(yList)
	
	* loop over years
	foreach y of local yList {
		
		* years 2007 and 2019 cannot be done because there is no pre- or post-
		* period in the DiD
		if `y' == 2007 | `y' == 2019 continue
		
		* ---- Build the control pool for (product, year): same for all specs
		qui use "${outdir}\data_for_cohort_match.dta", clear
		
		* tag "treated" product
		qui gen treated = Product == "`prod'"
		
		* keep only observations in year `y' (don't need other years to match)
		qui keep if year == `y'
		
		* import ATC information (a product with several ATC4 codes gets one
		* row per code)
		qui merge 1:m Product using "${inputdir}\ssr_product_atc4_codes.dta", ///
			keep(master match) nogen keepusing(atc4)
		
		* update the exclude variable to include any product with the same 
		* ATC4 as the "treated" product
		qui levelsof atc4 if treated == 1, local(treatedATCs)
		foreach code of local treatedATCs {
			qui bysort Product : egen temp = max(atc4 == "`code'" & treated == 0)
			qui replace exclude = 1 if temp == 1
			drop temp
		}
		
		* ATC-1 (first character of ATC4) as a numeric code for CEM; it is
		* only part of match lists 2-5
		qui gen atc1 = substr(atc4,1,1)
		qui egen atc_code = group(atc1)
		drop atc1 atc4
		
		* drop every excluded row of year `y' (except the "treated" product)
		qui drop if exclude == 1 & Product != "`prod'"
		drop exclude
		
		qui save `matchPool', replace
		
		* ---- Loop over the five possible match algorithms
		forvalues j = 1/5 {
			
			* erase existing file (if any) so that a stale result from an
			* earlier run is never appended later
			capture erase "${outdir}\matched_cohorts\temp`name'`y'_`j'.dta"
			
			* start from the prepared control pool
			qui use `matchPool', clear
			
			* Specs 4-5 match on lagged sales: skip them if the product in
			* question has no lagged sales information
			if `j' >= 4 {
				qui sum log_net_sales_1 if Product == "`prod'"
				if r(N) == 0 continue
				
				* also drop product-years with missing net sales
				qui drop if log_net_sales_1 == .
			}
			
			* Run CEM command
			qui cem `matchList`j'', treatment(treated) auto(`method')
			
			* keep only matched controls (and treated)
			qui keep if cem_matched == 1
			qui count
			if r(N) == 0 continue // no matches found
			
			* drop variables we don't need, including those created by cem
			qui drop age log_net_sales_1 cem_matched cem_strata cem_weights atc_code
			
			* drop duplicate observations (which can occur if a product has 
			* multiple ATC codes)
			qui duplicates drop
			
			* keep track of cem variables (index of the match list used)
			qui gen cem_varlist = `j'
			
			* identify treated product: sort treated rows first, then carry
			* the treated product's name down to every control row
			qui gen Treated_Product = Product if treated == 1
			gsort -treated
			qui replace Treated_Product = Treated_Product[_n-1] if Treated_Product == ""
			
			* save
			qui save "${outdir}\matched_cohorts\temp`name'`y'_`j'.dta", replace
			
		}
	}
}


* Append all per-cohort files into a single dataset and save it.
* Not every product-year-spec combination has a file (skipped years, missing
* sales, no matches), so each file is checked for existence first. A file that
* exists but cannot be appended now stops the script instead of being skipped
* silently, which would drop that cohort from the final dataset unnoticed.
clear
foreach prod of local prodList {
	local name = strtoname("`prod'")
	forvalues y = 2007/2019 {
		forvalues j = 1/5 {
			capture confirm file "${outdir}\matched_cohorts\temp`name'`y'_`j'.dta"
			if _rc == 0 {
				append using "${outdir}\matched_cohorts\temp`name'`y'_`j'.dta"
			}
		}
	}
}

compress

* Value labels for the match specification index
label define cem_varlist_lbl	1 "Match on year, age" ///
								2 "Match on year, age and ATC-1" ///
								3 "Match on year, age, ATC-1, and firm size category" ///
								4 "Match on year, age, ATC-1, and net sales in year prior to acquisition" ///
								5 "Match on year, age, ATC-1, net sales in year prior to acquisition, and firm size category"

label values cem_varlist cem_varlist_lbl

* This is the main matched cohort dataset
save "${outdir}\matched_cohorts_all.dta", replace
